import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import display, Markdown, Latex
## seaborn
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings("ignore")
The linear regression model is expressed as follows $$Y= \beta_0 + \sum_{j=1}^{p}\beta_{j} X_{j},$$ where $X_j$ represents the j-th predictor and $\beta_j$ quantifies the association between that variable and the response.
In the simple linear regression setting, where there is a single predictor ($p=1$), $\beta_0$ and $\beta_1$ are known as the intercept and slope terms, respectively.
In practice, the true coefficients $\beta_0,~\beta_1,~\ldots,~\beta_p$ are unknown. Instead, we compute estimates $\hat{\beta}_0,\hat{\beta}_1, \ldots, \hat{\beta}_p$. Given estimates $\hat{\beta}_0$, $\hat{\beta}_1$, $\ldots$, $\hat{\beta}_p$, the following formula can be used for predictions, $$\hat{y}=\hat{\beta}_0 +\hat{\beta}_1x_1 +\hat{\beta}_2 x_2 + \ldots +\hat{\beta}_p x_p.$$
Using the least-squares approach, $\beta_0$, $\beta_1$, $\ldots$, $\beta_p$ can be chosen to minimize the sum of squared residuals
\begin{align} RSS=\sum_{i=1}^{n}(y_i-\hat{y}_i)^2=\sum_{i=1}^{n}(y_i-\hat{\beta}_0 - \hat{\beta}_1 x_{i1} - \hat{\beta}_2 x_{i2} - \hat{\beta}_3 x_{i3}-\ldots - \hat{\beta}_p x_{ip})^2. \end{align}
There are a few definitions that we need throughout this document.
The $R^2$ statistic is a measure of the linear relationship between $X$ and $Y$. To calculate $R^2$, we use the formula $$R^2 = 1-\frac{\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2}.$$
The correlation matrix is the matrix whose $(i,j)$ entry is the correlation between the $i$th and $j$th variables of the given data set, where the correlation is defined as $$Cor(X,Y) = \frac{\sum_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i - \bar{x})^2}\sqrt{\sum_{i=1}^{n}(y_i - \bar{y})^2}}.$$
The correlation matrix is the covariance matrix of the standardized variables. The covariance matrix itself is also known as the auto-covariance matrix, dispersion matrix, variance matrix, or variance–covariance matrix.
Consider the following example.
# Load the Advertising data set (first four columns only) and preview it.
advertising = pd.read_csv('Data/Advertising.csv', usecols=range(4))
advertising.head()
Here, $\beta_j$ is interpreted as the average effect on $Y$ of a one-unit increase in $X_j$, holding all other predictors fixed. We have $$sales = \beta_0 + \beta_1 \times TV + \beta_2 \times radio + \beta_3 \times newspaper + \epsilon. $$
The estimates of $\beta_0$, $\beta_1$, $\beta_2$ and $\beta_3$ are obtained below.
# Fit Sales on all three media budgets by ordinary least squares and
# display the coefficient table.
sales_model = smf.ols('Sales ~ TV + Radio + Newspaper', advertising)
Results = sales_model.fit()
Results.summary().tables[1]
As can be seen,
For a fixed amount of TV and newspaper advertising, spending an additional $1,000 on radio advertising can increase sales by approximately 189 units.
For a fixed amount of Radio and newspaper advertising, spending an additional $1,000 on TV advertising can increase sales by approximately 46 units.
However, the coefficient estimate for the newspaper is very close to zero, and the corresponding p-value is insignificant.
RSS:
# RSS with regression coefficients
# NOTE: the original indexed Results.params[1] for BOTH the TV and Radio terms
# and params[2] for Newspaper — an off-by-one that used the wrong coefficients.
# Label-based access pairs each coefficient with its own predictor.
RSS = ((advertising.Sales - (Results.params['Intercept']
                             + Results.params['TV'] * advertising.TV
                             + Results.params['Radio'] * advertising.Radio
                             + Results.params['Newspaper'] * advertising.Newspaper))**2).sum() / 1000
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'RSS with regression coefficients' + Style.RESET_ALL + ' = %.4f' % RSS)
del RSS
Correlation Matrix and plot:
# Styler.set_precision was deprecated and removed in pandas 2.0;
# Styler.format(precision=...) (available since pandas 1.3) is the
# supported equivalent.
display(advertising.corr().style.background_gradient(cmap='RdBu').format(precision=2))
def Correlation_Plot(Df, Fig_Size):
    """Draw a lower-triangle correlation heatmap (diagonal included) of Df.

    Parameters
    ----------
    Df : pandas.DataFrame
        Data whose pairwise correlations are plotted (numeric columns).
    Fig_Size : float
        Width and height of the (square) figure in inches.
    """
    Correlation_Matrix = Df.corr().round(2)
    # Boolean mask hiding the strict upper triangle; k=1 keeps the diagonal
    # visible. (The original built a float mask and then un-masked the
    # diagonal with a Python loop — this is the same mask in one step.)
    mask = np.triu(np.ones_like(Correlation_Matrix, dtype=bool), k=1)
    Fig, ax = plt.subplots(figsize=(Fig_Size, Fig_Size))
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                cmap=sns.color_palette("RdBu", n_colors=10), linewidths=0.2,
                vmin=0, vmax=1, cbar_kws={"shrink": .7})
    # Removed a dead `bottom, top = ax.get_ylim()` left over from the
    # matplotlib 3.1.1 heatmap-truncation workaround: the limits were read
    # but never applied, so the call had no effect.

Correlation_Plot(advertising, 6)
We can see that the correlation between radio and newspaper is about 0.35. In other words, we can see a tendency to spend more on newspaper advertising in markets where more is spent on radio advertising.
Since the p-value for newspaper advertising is insignificant, we can provide a 3D diagram with the sales, TV, and radio advertising data.
$$sales = \beta_0 + \beta_1 \times TV + \beta_2 \times radio$$
reg = LinearRegression().fit(advertising[['Radio', 'TV']].values,advertising.Sales)
display(Latex(r'$\beta_0 = %.4f$, $\beta_1 = %.4f$, and $\beta_2 = %.4f$' % (reg.intercept_, reg.coef_[0], reg.coef_[1])))
fig = go.Figure()
fig.add_trace(go.Scatter3d(x=advertising.Radio, y=advertising.TV, z=advertising.Sales,
                           mode='markers',
                           marker=dict(size=4, color=advertising.Sales, colorscale='Blues', opacity=0.8)))
# Coordinate grid covering the observed Radio/TV ranges.
X, Y = np.meshgrid(np.arange(0, np.ceil(advertising.Radio.max())),
                   np.arange(0, np.ceil(advertising.TV.max())), indexing='xy')
# Evaluate the fitted plane on the whole grid at once (the original filled Z
# cell-by-cell with np.ndenumerate — same values, Python-level loop removed).
Z = reg.intercept_ + X * reg.coef_[0] + Y * reg.coef_[1]
fig.add_trace(go.Surface(z=Z, x=X, y=Y, colorscale='Greens', colorbar_len=0.6))
# For 3D figures the axis titles and ranges live under layout.scene;
# the original's layout.xaxis/layout.yaxis updates silently did nothing.
fig.update_layout(title="Regression: Sales ~ Radio + TV Advertising",
                  scene=dict(xaxis=dict(title='Radio', range=[0, 50]),
                             yaxis=dict(title='TV', range=[0, 300]),
                             zaxis=dict(title='Sales')))
fig.show()
del X, Y, Z
Note that we do not have to define our linear model in an additive way. For example, we can define a linear model that uses radio, TV, and an interaction between the two to predict sales; it takes the form \begin{align} sales &= \beta_0 + \beta_1 \times TV + \beta_2 \times radio + \beta_3 \times ( radio \times TV ) + \epsilon\\ &= \beta_0 + (\beta_1 + \beta_3 \times radio ) \times TV + \beta_2 \times radio + \epsilon. \end{align}
# Re-fit the advertising model including the TV-by-Radio interaction term
# and display the coefficient table.
interaction_model = smf.ols('Sales ~ TV + Radio + TV*Radio', advertising)
Results = interaction_model.fit()
Results.summary().tables[1]
The results clearly suggest that the model that includes the interaction term is superior to the model that contains only the main effects. Since the p-value for the interaction term, TV x radio, is noticeably low, it is clear that the true relationship is not additive.
# Load the Credit data set (first eleven columns only) and preview it.
credit = pd.read_csv('Data/Credit.csv', usecols=range(11))
credit.head(5)
The Credit data set contains information about balance, age, cards, education, income, limit, and rating for a number of potential customers.
# Pairwise scatterplots of the quantitative Credit variables.
quantitative_cols = ['Balance', 'Age', 'Cards', 'Education', 'Income', 'Limit', 'Rating']
sns.pairplot(credit[quantitative_cols]);
Each panel of the Figure is a scatterplot for a pair of variables whose identities are given by the corresponding row and column labels.
For example, the scatterplot directly to the right of the word “Balance” depicts balance versus age , while the plot directly to the right of “Age” corresponds to age versus cards . In addition to these quantitative variables, we also have four qualitative variables: gender , student (student status), status (marital status), and ethnicity (Caucasian, African American or Asian).
We simply create an indicator, or dummy variable, that takes on two possible numerical (binary) values.
For example, based on the gender variable, we can create a new binary variable that takes the form $$x_i =\begin{cases}1, & \mbox{if ith person is female}, \\0, & \mbox{if ith person is male},\end{cases}$$
and the regression equation:
$$y_i = \beta_0 + \beta_1 x_i + \epsilon_i =\begin{cases}\beta_0 + \beta_1 + \epsilon_i, & \mbox{if ith person is female}, \\\beta_0 + \epsilon_i , & \mbox{if ith person is male.}\end{cases}$$
Results = smf.ols('Balance ~ C(Gender)', credit).fit()
Results.summary().tables[1]
# Balance regressed on ethnicity (three-level qualitative predictor).
ethnicity_model = smf.ols('Balance ~ C(Ethnicity)', credit)
Results = ethnicity_model.fit()
Results.summary().tables[1]
# Balance regressed on marital status (two-level qualitative predictor).
married_model = smf.ols('Balance ~ C(Married)', credit)
Results = married_model.fit()
Results.summary().tables[1]
There is an interaction term between income and student. We have
\begin{align} \text{balance}_i \approx \beta_0 + \beta_1 \times \text{income}_i + \begin{cases} \beta_2, & \mbox{if ith person is a student},\\0, & \mbox{if ith person is not a student} \\ \end{cases} =\beta_1 \times \text{income}_i + \begin{cases} \beta_0 + \beta_2, & \mbox{if ith person is a student},\\ \beta_0, & \mbox{if ith person is not a student} \\ \end{cases} \end{align} \begin{align} \text{balance}_i &\approx \beta_0 + \beta_1 \times \text{income}_i + \begin{cases} \beta_2+\beta_3 \times \text{income}_i, & \mbox{if ith person is a student},\\0, & \mbox{if ith person is not a student} \\ \end{cases}\\ &=\begin{cases} (\beta_0+\beta_2)+(\beta_1+\beta_3) \times \text{income}_i, & \mbox{if ith person is a student},\\ \beta_0 + \beta_1 \times \text{income}_i, & \mbox{if ith person is not a student} \\ \end{cases} \end{align}
Regression 1 - without interaction term:
# Regression 1: Balance on Income and Student status, no interaction.
model_no_interaction = smf.ols('Balance ~ Income + Student', credit)
Results1 = model_no_interaction.fit()
reg1 = Results1.params
Results1.summary().tables[1]
Regression 2 - with interaction term:
# Regression 2: Balance on Income and Student status, with interaction.
model_with_interaction = smf.ols('Balance ~ Income + Income*Student', credit)
Results2 = model_with_interaction.fit()
reg2 = Results2.params
Results2.summary().tables[1]
# Side-by-side comparison of the two fitted models: students vs non-students.
# (The original created a go.Figure() that was immediately overwritten by
# make_subplots; that dead assignment is removed.)
fig = make_subplots(rows=1, cols=2)
# x-axis (Income)
Max_Income = np.ceil(credit['Income'].max())
income = np.linspace(0, Max_Income)
# Fitted Balance lines evaluated directly on the income grid. This is
# numerically equivalent to the original np.linspace between the two
# endpoints, but makes the linear form of each model explicit.
# y-axis (Balance without interaction term)
st1 = reg1['Intercept'] + reg1['Student[T.Yes]'] + income * reg1['Income']
non_st1 = reg1['Intercept'] + income * reg1['Income']
# y-axis (Balance with interaction term)
st2 = (reg2['Intercept'] + reg2['Student[T.Yes]']
       + income * (reg2['Income'] + reg2['Income:Student[T.Yes]']))
non_st2 = reg2['Intercept'] + income * reg2['Income']
# Left panel: no interaction.
fig.add_trace(go.Scatter(x=income, y=st1, line=dict(color='OrangeRed', width=1.5),
                         name='Student'), 1, 1)
fig.add_trace(go.Scatter(x=income, y=non_st1, line=dict(color='MidnightBlue', width=1.5),
                         name='Non-Student'), 1, 1)
# Right panel: with interaction.
fig.add_trace(go.Scatter(x=income, y=st2, line=dict(color='OrangeRed', width=1.5),
                         name='Student', showlegend=False), 1, 2)
fig.add_trace(go.Scatter(x=income, y=non_st2, line=dict(color='MidnightBlue', width=1.5),
                         name='Non-Student', showlegend=False), 1, 2)
# Shared axis labels and ranges for both panels.
fig.update_xaxes(title_text='Income', range=[0, 2e2], row=1, col=1)
fig.update_xaxes(title_text='Income', range=[0, 2e2], row=1, col=2)
fig.update_yaxes(title_text='Balance', range=[0, 2e3], row=1, col=1)
fig.update_yaxes(title_text='Balance', range=[0, 2e3], row=1, col=2)
# Fixed user-visible typo: "iteraction" -> "interaction".
fig.update_layout(title_text="Balance with interaction term", legend_title_text='Status')
fig.show()
James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An introduction to statistical learning (Vol. 112, pp. 3-7). New York: springer.
Jordi Warmenhoven, ISLR-python